Lab 10 Objectives:

PART 0. Load libraries

library(tidyverse) # The tidyverse!
## ── Attaching packages ──────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.6
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(corrplot) # For correlation matrices
## corrplot 0.84 loaded
library(janitor) # For cleaning up column names
library(lubridate) # For dealing with dates & times
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(gganimate) # For adding animation to ggplot graphics
library(ggimage) # For updating graph components with images

PART 1. Correlation matrix (World Bank world environmental factors data)

Compiled World Bank data, accessed from: https://www.kaggle.com/zanderventer/environmental-variables-for-world-countries#World_countries_env_vars.csv

env_var <- read_csv("world_env_vars.csv") %>% 
  na.omit
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Country = col_character()
## )
## See spec(...) for full column specifications.
cor_df <- cor(env_var[2:28])

corrplot(cor_df,
         type = "upper",
         method = "ellipse",
         tl.col = "black",
         tl.cex = .5) #only shows "upper" half of symmetrical plot, directionality showed by "ellipse" shape, text label color is "black", text label size is ".5"

PART 2. Binary Logistic Regression (Donner Party Data)

Use the ‘glm’ function for fitting generalized linear models (the logit - log odds of survival, in our case, will be linearly related to Sex and Age. So we expect the final model to look something like this:

\[Log Odds (Survival) = \beta_0 + \beta_1(Age) + \beta_2(Sex)\]

We’ll use ‘family = binomial’ to run binomial logistic regression…otherwise, this looks very similar to other types of regression we’ve already done.

  1. Read in the DonnerTable.csv file as DonnerTable
DonnerTable <- read_csv("DonnerTable.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   Age = col_integer(),
##   Outcome = col_integer(),
##   Sex = col_character(),
##   Family.name = col_character(),
##   Status = col_character()
## )
  1. Binomial logistic regression

1 = survival 0 = death

donner_blr <- glm(Outcome ~ Sex + Age, family = "binomial", data = DonnerTable)

summary(donner_blr)
## 
## Call:
## glm(formula = Outcome ~ Sex + Age, family = "binomial", data = DonnerTable)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8828  -1.0383   0.6511   1.0261   1.7386  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  1.62180    0.50279   3.226  0.00126 **
## SexMale     -1.06798    0.48229  -2.214  0.02680 * 
## Age         -0.03561    0.01525  -2.336  0.01952 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 120.86  on 87  degrees of freedom
## Residual deviance: 108.87  on 85  degrees of freedom
##   (3 observations deleted due to missingness)
## AIC: 114.87
## 
## Number of Fisher Scoring iterations: 4
  1. Use the model to answer: What are the log odds of survival for a 5 year-old female? The probability of survival?
# 1. Create a data frame with variables Sex and Age, containing data "Female" and 5, respectively: 

f_5 <- data.frame(Sex = "Female", Age = 5)

# 2. Find the log odds of survival for the new data (5 year old female) using predict() function with type = "link":

f_5_logodds <- predict(donner_blr, newdata = f_5, type = "link")

# 3. Exponentiate the log odds to find ODDS of survival for a 5 year old female:

exp(f_5_logodds)
##       1 
## 4.23666
# Ask: Could we manually find the probability of survival for a 5-year old female? recall: p/(1-p) = ODDS --> p = 80%

# 4. Actually, let's just use type = "response" in the predict function, which converts to a probability for us:

f_5_prob <- predict(donner_blr, newdata = f_5, type = "response")
f_5_prob
##         1 
## 0.8090386
  1. What is the probability of survival for a 25 year-old male?
# Similarly:

m_25 <- data.frame(Sex = "Male", Age = 25) # Make a new data frame

m_25_prob <- predict(donner_blr, newdata = m_25, type = "response") # Find probability of survival
m_25_prob
##         1 
## 0.4167068
  1. Create new sequences of data so that we can graph probabilities for the entire spectrum of ages, designated by sex.
seq_age <- rep(seq(from = 0, to = 100), 2) # Create a sequence from 0 to 100, twice (one will be "Male" and one will be "Female")

f_101 <- rep("Female", 101) # Repeat 'Female' 101 times (to match years data)
m_101 <- rep("Male", 101) # Repeat 'Male' 101 times
mf_101 <- c(f_101, m_101) # Combine them into a single vector

# Combine the age and sex sequences into a single data frame - that will be the new data that we have our model make predictions for

donner_newdata <- data.frame(seq_age, mf_101) # MUST make column names match variables in the model!
colnames(donner_newdata) <- c("Age","Sex")
  1. Now that we have new data to put into our model to have it make predictions, let’s go ahead and actually find the predicted probabilities for each Age/Sex combination.
# Find probabilities using predict (with type = "response"). Include SE.

predicted_probs <- predict(donner_blr, newdata = donner_newdata, type = "response", se.fit = TRUE)

# Coerce outcome into data frame. 

graph_data <- data.frame(donner_newdata, predicted_probs$fit, predicted_probs$se.fit)

colnames(graph_data) <- c("Age", "Sex", "Probability", "SE")
  1. Graph results.
ggplot(graph_data, aes(x = Age, y = Probability)) +
  geom_line(aes(color = Sex)) +
  geom_ribbon(aes(ymin = Probability - SE, ymax = Probability + SE, fill = Sex, alpha = .5))

Our graph corroborates our results from our model: females have higher probability of survival than males, and younger ages have higher probability tha older ages.

PART 3. Solar irradiation at the 3 locations in 2010 (pull in all together, do some lubridate stuff, etc.)

  1. Read in multiple solar irradiation files (for SB (CA), Hilo (HI), and Fairbanks (AK)):
si_full <- list.files(pattern = "solar_irradiation_*") %>% 
  map_df(~read_csv(.)) %>% 
  clean_names()
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   `YYYY-MM-DD` = col_character(),
##   `HH:MM (LST)` = col_time(format = ""),
##   `Zenith (deg)` = col_double(),
##   `Azimuth (deg)` = col_double(),
##   `Precip Wat (cm)` = col_double(),
##   `AOD (unitless)` = col_double(),
##   `AOD RAN (unitless)` = col_double(),
##   `Ozone (cm)` = col_double(),
##   `Albedo (unitless)` = col_double(),
##   site = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   `YYYY-MM-DD` = col_character(),
##   `HH:MM (LST)` = col_time(format = ""),
##   `Zenith (deg)` = col_double(),
##   `Azimuth (deg)` = col_double(),
##   `Precip Wat (cm)` = col_double(),
##   `AOD (unitless)` = col_double(),
##   `AOD RAN (unitless)` = col_double(),
##   `Ozone (cm)` = col_double(),
##   `Albedo (unitless)` = col_double(),
##   site = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   `YYYY-MM-DD` = col_character(),
##   `HH:MM (LST)` = col_time(format = ""),
##   `Zenith (deg)` = col_double(),
##   `Azimuth (deg)` = col_double(),
##   `Precip Wat (cm)` = col_double(),
##   `AOD (unitless)` = col_double(),
##   `AOD RAN (unitless)` = col_double(),
##   `Ozone (cm)` = col_double(),
##   `Albedo (unitless)` = col_double(),
##   site = col_character()
## )
## See spec(...) for full column specifications.
  1. Wrangle the data
solar_tidy <- si_full %>% 
  rename(
    sol_rad = etr_wh_m_2,
    date = yyyy_mm_dd,
    time = hh_mm_lst
  ) %>% 
  filter(time != "NA") %>%
  mutate(site = fct_relevel(site, "Hawaii", "Santa Barbara", "Alaska"))
  1. Use lubridate() functions to convert to times/dates
solar_tidy$date <- mdy(solar_tidy$date)
solar_tidy$time <-hms(solar_tidy$time)
  1. Make an awesome figure of solar irradiation (heat/tile map)
solar_gg <- ggplot(solar_tidy, aes(x = date, y = time)) +
  geom_tile(aes(fill = sol_rad)) +
    scale_fill_gradientn(colors = c("royalblue2", "mediumorchid1", "orange", "yellow")) +
  scale_y_time() +
  facet_grid(site ~ .)
solar_gg

PART 4. gganimate example: total aquaculture production (metric tons) for United States, Brazil, Chile, and Ecuador

  1. Get the data, and do some wrangling:
aq_df <-read_csv("aq_wb.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   country = col_character(),
##   code = col_character(),
##   indicator = col_character(),
##   ind_code = col_character(),
##   `1960` = col_integer(),
##   `1961` = col_integer(),
##   `1962` = col_integer(),
##   `1963` = col_integer(),
##   `1964` = col_integer(),
##   `1965` = col_integer(),
##   `1966` = col_integer(),
##   `1967` = col_integer(),
##   `1968` = col_integer(),
##   `1969` = col_integer(),
##   `1970` = col_integer(),
##   `1971` = col_integer(),
##   `1972` = col_integer(),
##   `1973` = col_integer(),
##   `1974` = col_integer(),
##   `1975` = col_integer()
##   # ... with 9 more columns
## )
## See spec(...) for full column specifications.
aq_tidy <- aq_df %>% 
  filter(country == "Brazil" |
           country == "Chile" |
           country == "Ecuador" |
           country == "United States") %>% 
  gather(year, aq_prod, `1960`:`2014`) %>% 
  filter(year >= 1990) %>% 
  mutate(aq_mil = aq_prod/1000000) %>% 
  select(country, year, aq_mil)
  #gather() creates a new column called year, and a new column called aq_prod, based on data from columns 1960-2014 (i.e., converts data from wide format to long format) (n.b., spread() converts data from long format to wide format)
  1. Read in the fish.png as ‘fish’
fish <-"fish.png"
  1. Make a graph…with gganimate!
aq_plot <- ggplot(aq_tidy, aes(x = as.numeric(year), y = aq_mil, group = country)) +
  geom_line(aes(color = country)) +
  geom_point(aes(color = country)) +
  geom_image(aes(image = fish)) +
  geom_text(aes(label = country, color = country), position = position_nudge(y = .04, x = 1), size = 5) +
  transition_reveal(country, as.numeric(year))
## Warning: Ignoring unknown parameters: image_colour
aq_plot

END LAB